# Load libraries --------------------------------------------------------------
# Data wrangling and interactive plotting
library(tidyverse) ; library(reshape2) ; library(glue) ; library(plotly) ; library(plotlyutils)
# Colour palettes and plot composition
# (require() only returns FALSE on failure; library() errors loudly, which is
# what we want for a hard dependency)
library(RColorBrewer) ; library(viridis) ; library(gridExtra) ; library(GGally) ; library(ggExtra) ; library(ggpubr)
# Bioinformatics: annotation, differential expression, batch effects, networks
library(biomaRt) ; library(DESeq2) ; library(sva) ; library(WGCNA) ; library(vsn)
# Dendrograms and cross-tabulations
library(dendextend) ; library(expss)
# Report tables
library(knitr) ; library(kableExtra)

Raw data



Dataset downloaded from Arkinglab website in the Transcriptome analysis reveals dysregulation of innate immune response genes and neuronal activity-dependent genes in autism section.

Load and annotate data

# Load csvs
# Load csvs
# NOTE(review): read.delim expects tab-separated input — presumably these .csv
# files are tab-delimited despite the extension; confirm against the raw files
datExpr = read.delim('./../Data/datExpr.csv')
datMeta = read.delim('./../Data/datPheno.csv')

# Create dataset with gene information
# The Gene column concatenates the Ensembl ID (always 15 characters) and the
# gene symbol with a one-character separator at position 16
datGenes = data.frame('ensembl_gene_id' = substr(datExpr$Gene, 1, 15), 
                      'hgnc_symbol' = substring(datExpr$Gene, 17))
# Index expression rows by Ensembl ID and drop the now-redundant Gene column
rownames(datExpr) = datGenes$ensembl_gene_id
datExpr$Gene = NULL


### CLEAN METADATA DATA FRAME
# Keep only the columns used downstream; recompute brainregion from the first
# 4 characters of the sample ID (e.g. 'ba19'); group Brodmann areas into lobes
# (BA19 = Occipital, everything else = Frontal); encode Diagnosis as a factor
# with 'CTL' as the reference level (this matters for the DESeq2 design later)
datMeta = datMeta %>% dplyr::select('ID', 'case', 'sampleid', 'brainregion', 'positiononplate', 
                                    'Gender', 'Age', 'SiteHM', 'RIN', 'PMI', 'Dx') %>%
          mutate(brainregion = substr(ID,1,4)) %>%
          mutate(Brain_lobe = ifelse(brainregion=='ba19', 'Occipital', 'Frontal'),
                 Diagnosis = ifelse(Dx=='Autism', 'ASD', 'CTL')) %>%
          mutate(Diagnosis = factor(Diagnosis, levels = c('CTL','ASD'))) %>%
          dplyr::rename(Subject_ID = sampleid, Sex = Gender)



# GO Neuronal annotations: regex 'neuron' in GO functional annotations and label the genes that make a match as neuronal
GO_annotations = read.csv('./../Data/genes_GO_annotations.csv')
# One row per matching gene; Neuronal = 1 acts as a flag so that a later join
# marks these genes (genes absent from GO_neuronal will get NA)
GO_neuronal = GO_annotations %>% filter(grepl('neuron', go_term)) %>% 
              mutate('ID'=as.character(ensembl_gene_id)) %>% 
              dplyr::select(-ensembl_gene_id) %>% distinct(ID) %>%
              mutate('Neuronal'=1)


# SFARI Genes
SFARI_genes = read_csv('./../../../SFARI/Data/SFARI_genes_01-03-2020_w_ensembl_IDs.csv')
# Drop rows without an Ensembl ID and keep only the first occurrence of each ID
SFARI_genes = SFARI_genes[!duplicated(SFARI_genes$ID) & !is.na(SFARI_genes$ID),]


# NCBI biotype annotation
# Harmonise column names with the BioMart output, and NCBI's 'protein-coding'
# spelling with Ensembl's 'protein_coding' so the labels can be coalesced later
NCBI_biotype = read.csv('./../../../NCBI/Data/gene_biotype_info.csv') %>% 
               dplyr::rename('ensembl_gene_id'=Ensembl_gene_identifier, 'gene_biotype'=type_of_gene, 
                             'hgnc_symbol'=Symbol) %>% 
               mutate(gene_biotype = ifelse(gene_biotype=='protein-coding','protein_coding', gene_biotype))


# Emulate ggplot2's default categorical palette: n equally spaced hues around
# the HCL colour wheel at luminance 65 and chroma 100.
# Fix: the original ended with an assignment ('pal = ...'), so the palette was
# only returned invisibly; return the value directly instead.
gg_colour_hue = function(n) {
  # n+1 hues from 15 to 375 degrees; the last one (375 == 15 mod 360) is a
  # duplicate of the first, so it is dropped
  hues = seq(15, 375, length = n + 1)
  hcl(h = hues, l = 65, c = 100)[1:n]
}

rm(GO_annotations)

Check sample composition



Data description taken from the paper Transcriptome analysis reveals dysregulation of innate immune response genes and neuronal activity-dependent genes in autism:

Transcriptomes from 104 human brain cortical tissue samples were resolved using next-generation RNA sequencing technology at single-gene resolution and through co-expressing gene clusters or modules. Multiple cortical tissues corresponding to Brodmann Area 19 (BA19), Brodmann Area 10 (BA10) and Brodmann Area 44 (BA44) were sequenced in 62, 14 and 28 samples, respectively, resulting in a total of 57 (40 unique individuals) control and 47 (32 unique individuals) autism samples.

Brain tissue: Frozen brain samples were acquired through the Autism Tissue Program, with samples originating from two different sites: the Harvard Brain Tissue Resource Center and the NICHD Brain and Tissue Bank at the University of Maryland (Gandal’s data were obtained also from the Autism Tissue Program, specifically from the Harvard Brain Bank).

Sequenced using Illumina’s HiSeq 2000 (Gandal used Illumina HiSeq 2500, they are compatible).


The dataset includes 62069 genes from 120 samples belonging to 72 different subjects.

In the paper they talk about an original number of 110 samples and dropping 6 because of low gene coverage, resulting in 104 samples (which are the ones that are included in datMeta), but the expression dataset has 120 samples.

# Samples present in the expression matrix but absent from the metadata.
# Sample IDs have the form '<region>.<subject>' (e.g. 'ba19.s13'), so the
# subject ID is everything from character 6 onwards
no_metadata_samples = colnames(datExpr)[! colnames(datExpr) %in% datMeta$ID]
no_metadata_subjects = unique(substring(no_metadata_samples, 6))

Samples without Metadata information: ba10.s11, ba10.s12, ba10.s21, ba10.s24, ba10.s87, ba19.s13, ba19.s21, ba19.s54, ba19.s60, ba19.s87, ba44.s12, ba44.s21, ba44.s23, ba44.s24, ba44.s77, ba44.s87

Samples without metadata but with a Subject ID that is included in the Metadata data frame s11, s13, s60, s23

Since we need the Phenotype information of the samples, I’m going to keep the samples that I can map to an existing Subject ID in the metadata and remove the others

# Subjects that do appear in the metadata, and the unmapped samples that
# belong to them (matched by subject ID inside the sample name)
add_metadata_subjects = no_metadata_subjects[no_metadata_subjects %in% datMeta$Subject_ID]
add_metadata_samples = no_metadata_samples[grepl(paste(add_metadata_subjects, collapse='|'),
                                                 no_metadata_samples)]

# For each recoverable sample, clone the subject's first metadata row and
# overwrite the sample-specific fields (ID, brain region, lobe)
for(sample in add_metadata_samples){
  # Sample IDs look like '<region>.<subject>' (e.g. 'ba19.s13'); split once
  # per iteration instead of three times as before
  sample_parts = strsplit(sample, '\\.')[[1]]
  new_row = datMeta %>% filter(Subject_ID == sample_parts[2]) %>% dplyr::slice(1) %>%
            mutate(ID = sample, brainregion = sample_parts[1],
                   Brain_lobe = ifelse(sample_parts[1]=='ba19','Occipital','Frontal'))
  datMeta = rbind(datMeta, new_row)
}

# Keep only the samples whose subject now has metadata (everything else has no
# phenotype information and cannot be used)
keep = substring(colnames(datExpr), 6) %in% datMeta$Subject_ID

datExpr = datExpr[,keep]

# Match order of datExpr columns and datMeta rows
datMeta = datMeta[match(colnames(datExpr), datMeta$ID),]

# Check they are in the same order
# NOTE(review): this only prints a message; consider stop() so a mismatch
# halts the pipeline instead of scrolling past in the rendered report
if(!all(colnames(datExpr) == datMeta$ID)){
  cat('\norder of samples don\'t match between datExpr and datMeta!\n')
}

rm(no_metadata_subjects, no_metadata_samples, add_metadata_subjects, add_metadata_samples, sample, new_row)

Removing 12 samples (ba10.s14, ba10.s69, ba10.s72, ba19.s13, ba19.s28, ba19.s62, ba44.s1, ba44.s28, ba44.s44, ba44.s48, NA, NA) belonging to subjects with ID s14, s69, s72, s13, s28, s62, s1, s44, s48, NA


The dataset now has 108 samples belonging to 72 different subjects.

rm(keep)

Counts distribution: More than half of the counts are zero and most of the counts are relatively low, but there are some very high outliers

# Long-format counts: one row per gene x sample value
counts = datExpr %>% melt

# Six-number summary of the raw counts (same statistics, and in the same
# order, as base summary()); rendered with kable below
count_values = counts$value
count_quartiles = unname(quantile(count_values, probs = c(.25, .5, .75)))
count_distr = data.frame('Statistic' = c('Min', '1st Quartile', 'Median', 'Mean', '3rd Quartile', 'Max'),
                         'Values' = c(min(count_values), count_quartiles[1], count_quartiles[2],
                                      mean(count_values), count_quartiles[3], max(count_values)))

count_distr %>% kable(digits = 2, format.args = list(scientific = FALSE)) %>% kable_styling(full_width = F)
Statistic Values
Min 0.00
1st Quartile 0.00
Median 0.00
Mean 250.81
3rd Quartile 8.00
Max 7718873.00
rm(counts, count_distr)


Diagnosis distribution by Sample: There are a few more CTL samples than ASD

# Attach display labels for the expss cross-tabulations that follow
table_info = datMeta %>% apply_labels(Diagnosis = 'Diagnosis', Brain_lobe = 'Brain Lobe', 
                                      SiteHM = 'Processing Group', Sex = 'Gender')
cro(table_info$Diagnosis)
 #Total 
 Diagnosis 
   CTL  58
   ASD  50
   #Total cases  108


Diagnosis distribution by Subject: There are more control subjects than ASD ones

cro(table_info$Diagnosis[!duplicated(table_info$Subject_ID)])
 #Total 
 Diagnosis 
   CTL  40
   ASD  32
   #Total cases  72

Brain region distribution: The Occipital lobe has more samples than the Frontal lobe, even though we are combining two brain regions in the Frontal Lobe

cro(table_info$Brain_lobe)
 #Total 
 Brain Lobe 
   Frontal  44
   Occipital  64
   #Total cases  108


Most of the Control samples are from the Occipital lobe, the Autism samples are balanced between both lobes. This could cause problems because Control and Occipital are related

cro(table_info$Diagnosis, list(table_info$Brain_lobe,total()))
 Brain Lobe     #Total 
 Frontal   Occipital   
 Diagnosis 
   CTL  19 39   58
   ASD  25 25   50
   #Total cases  44 64   108


Gender distribution: There are about three times as many Male samples as Female ones

cro(table_info$Sex)
 #Total 
 Gender 
   F  26
   M  82
   #Total cases  108


There is a small imbalance between gender and diagnosis with more males in the control group than in the autism group

cro(table_info$Diagnosis, list(table_info$Sex, total()))
 Gender     #Total 
 F   M   
 Diagnosis 
   CTL  12 46   58
   ASD  14 36   50
   #Total cases  26 82   108


Age distribution: Subjects between 2 and 82 years old with a mean of 18 years old

summary(datMeta$Age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2.00    8.75   18.00   20.49   22.00   82.00
# One row per subject (first sample kept) so subjects with several samples
# don't inflate the age distribution
datMeta_by_subject = datMeta %>% filter(!duplicated(Subject_ID))
# Per-diagnosis densities overlaid on the overall (gray) density
datMeta_by_subject %>% ggplot(aes(Age)) +
                       geom_density(alpha=0.2, aes(group = Diagnosis, fill = Diagnosis, color = Diagnosis)) +
                       geom_density(alpha=0.3, fill='gray', color='gray') +
                       theme_minimal()

rm(datMeta_by_subject)



Processing Group and Diagnosis: There is a relation between processing group and Diagnosis! This can be a problem because processing group is an important Batch Correction covariate, but if it’s related to Diagnosis, we risk inadvertently losing information related to ASD

cro(table_info$Diagnosis, list(table_info$SiteHM,total()))
 Processing Group     #Total 
 H   M   
 Diagnosis 
   CTL  13 45   58
   ASD  38 12   50
   #Total cases  51 57   108
rm(table_info)



Annotate genes with BioMart information



I was originally running this with the feb2014 version of BioMart because that’s the one that Gandal used (and it finds all of the Ensembl IDs, which other versions don’t), but it has some outdated biotype annotations, to fix this I’ll obtain all the information except the biotype label from BioMart in the same way as it had been done before, and then I’ll add the most current biotype label using information from NCBI’s website and then from BioMart in the following way:

  1. Use BioMart to run a query with the original feb2014 version using the Ensembl IDs as keys to obtain all the information except the biotype labels (we are not going to retrieve the gene name from biomart because we already extracted it from datExpr)

  2. Annotate genes with Biotype labels:

    2.1 Use the NCBI annotations downloaded from NCBI’s website and processed in NCBI/RMarkdowns/clean_data.html (there is information for only 26K genes, so some genes will remain unlabelled)

    2.2 Use the current version (jan2020) to obtain the biotype annotations using the Ensembl ID as keys (some genes don’t return a match)

    2.3 For the genes that didn’t return a match, use the current version (jan2020) to obtain the biotype annotations using the gene name as keys

    2.4 For the genes that returned multiple labels, use the feb2014 version with the Ensembl IDs as keys


Note: A small proportion of genes don’t make a match in any of these queries, so they will be lost when we start filtering out genes

# Track how many genes each annotation source ends up labelling (plotted later)
labels_source = data.frame('source' = c('NCBI', 'BioMart2020_byID', 'BioMart2020_byGene', 'BioMart2014'),
                                      'n_matches' = rep(0,4))

########################################################################################
# 1. Query archive version

# Coordinates (not biotype) from the feb2014 archive, keyed by Ensembl ID
getinfo = c('ensembl_gene_id','chromosome_name','start_position','end_position','strand')
mart = useMart(biomart = 'ENSEMBL_MART_ENSEMBL', dataset = 'hsapiens_gene_ensembl', 
               host = 'feb2014.archive.ensembl.org')
datGenes_BM = getBM(attributes = getinfo, filters = c('ensembl_gene_id'), values = rownames(datExpr), 
                    mart = mart)

datGenes = datGenes %>% left_join(datGenes_BM, by = 'ensembl_gene_id')

# Gene length in bp; NA exactly when the feb2014 query returned no match
datGenes$length = datGenes$end_position - datGenes$start_position

cat(paste0('1. ', sum(is.na(datGenes$start_position)), '/', nrow(datGenes),
             ' Ensembl IDs weren\'t found in the feb2014 version of BioMart'))
## 1. 1580/62069 Ensembl IDs weren't found in the feb2014 version of BioMart
########################################################################################
########################################################################################
# 2. Get Biotype Labels

cat('2. Add biotype information')
## 2. Add biotype information
########################################################################################
# 2.1 Add NCBI annotations
# Joining on both ID and symbol: a row must match on both keys to get a label
datGenes = datGenes %>% left_join(NCBI_biotype, by=c('ensembl_gene_id','hgnc_symbol'))

cat(paste0('2.1 ' , sum(is.na(datGenes$gene_biotype)), '/', nrow(datGenes),
             ' Ensembl IDs weren\'t found in the NCBI database'))
## 2.1 42070/62069 Ensembl IDs weren't found in the NCBI database
labels_source$n_matches[1] = sum(!is.na(datGenes$gene_biotype))

########################################################################################
# 2.2 Query current BioMart version for gene_biotype using Ensembl ID as key

getinfo = c('ensembl_gene_id','gene_biotype')
mart = useMart(biomart = 'ENSEMBL_MART_ENSEMBL', dataset = 'hsapiens_gene_ensembl',
               host = 'jan2020.archive.ensembl.org')
# Only query the genes still missing a label after the NCBI step
datGenes_biotype = getBM(attributes = getinfo, filters = c('ensembl_gene_id'), mart=mart, 
                         values = datGenes$ensembl_gene_id[is.na(datGenes$gene_biotype)])

cat(paste0('2.2 ' , sum(is.na(datGenes$gene_biotype))-nrow(datGenes_biotype), '/', 
           sum(is.na(datGenes$gene_biotype)),
           ' Ensembl IDs weren\'t found in the jan2020 version of BioMart when querying by Ensembl ID'))
## 2.2 9621/42070 Ensembl IDs weren't found in the jan2020 version of BioMart when querying by Ensembl ID
# Add new gene_biotype info to datGenes
# coalesce keeps the NCBI label when present, otherwise takes the new one
datGenes = datGenes %>% left_join(datGenes_biotype, by='ensembl_gene_id') %>%
           mutate(gene_biotype = coalesce(as.character(gene_biotype.x), gene_biotype.y)) %>%
           dplyr::select(-gene_biotype.x, -gene_biotype.y)

labels_source$n_matches[2] = sum(!is.na(datGenes$gene_biotype)) - labels_source$n_matches[1]

########################################################################################
# 3. Query current BioMart version for gene_biotype using gene symbol as key
# (NOTE(review): the section-header numbers 3/4 don't match the printed step
# labels 2.3/2.4 — they refer to the same steps)

# Fall back to the gene symbol for genes still unlabelled after 2.1 and 2.2;
# 'mart' is still the jan2020 connection from the previous chunk
missing_genes = unique(datGenes$hgnc_symbol[is.na(datGenes$gene_biotype)])
getinfo = c('hgnc_symbol','gene_biotype')
datGenes_biotype_by_gene = getBM(attributes = getinfo, filters = c('hgnc_symbol'), mart = mart,
                                 values = missing_genes)

cat(paste0('2.3 ', length(missing_genes)-length(unique(datGenes_biotype_by_gene$hgnc_symbol)),'/',
           length(missing_genes),
           ' genes weren\'t found in the current BioMart version when querying by gene name'))
## 2.3 6295/8036 genes weren't found in the current BioMart version when querying by gene name
# Symbols that map to more than one biotype are ambiguous and skipped here;
# they get another chance via the feb2014 archive in step 2.4
dups = unique(datGenes_biotype_by_gene$hgnc_symbol[duplicated(datGenes_biotype_by_gene$hgnc_symbol)])
cat(paste0('    ', length(dups), ' genes returned multiple labels (these won\'t be added)'))
##     18 genes returned multiple labels (these won't be added)
# Update information
datGenes_biotype_by_gene = datGenes_biotype_by_gene %>% filter(!hgnc_symbol %in% dups)
datGenes = datGenes %>% left_join(datGenes_biotype_by_gene, by='hgnc_symbol') %>% 
           mutate(gene_biotype = coalesce(gene_biotype.x, gene_biotype.y)) %>%
           dplyr::select(-gene_biotype.x, -gene_biotype.y)

# n_matches[3] and [4] are still 0 here, so subtracting the running sum gives
# the number of labels contributed by this step alone
labels_source$n_matches[3] = sum(!is.na(datGenes$gene_biotype)) - sum(labels_source$n_matches)

########################################################################################
# 4. Query feb2014 BioMart version for the missing biotypes

missing_ensembl_ids = unique(datGenes$ensembl_gene_id[is.na(datGenes$gene_biotype)])

getinfo = c('ensembl_gene_id','gene_biotype')
mart = useMart(biomart = 'ENSEMBL_MART_ENSEMBL', dataset = 'hsapiens_gene_ensembl', 
               host = 'feb2014.archive.ensembl.org')
datGenes_biotype_archive = getBM(attributes = getinfo, filters=c('ensembl_gene_id'), 
                                 values = missing_ensembl_ids, mart=mart)

cat(paste0('2.4 ', length(missing_ensembl_ids)-nrow(datGenes_biotype_archive),'/',length(missing_ensembl_ids),
             ' genes weren\'t found in the feb2014 BioMart version when querying by Ensembl ID'))
## 2.4 1219/7594 genes weren't found in the feb2014 BioMart version when querying by Ensembl ID
# Update information
datGenes = datGenes %>% left_join(datGenes_biotype_archive, by='ensembl_gene_id') %>% 
            mutate(gene_biotype = coalesce(gene_biotype.x, gene_biotype.y)) %>%
            dplyr::select(-gene_biotype.x, -gene_biotype.y)

labels_source$n_matches[4] = sum(!is.na(datGenes$gene_biotype)) - sum(labels_source$n_matches)

########################################################################################
# Plot results

# Append the still-unlabelled genes as 'missing' and express the counts as
# percentages; the factor levels fix the stacking order of the bar segments
labels_source = labels_source %>% add_row(source = 'missing', 
                                          n_matches = nrow(datGenes) - sum(labels_source$n_matches)) %>% 
                mutate(x = 1, percentage = round(100*n_matches/sum(n_matches),2),
                       source = factor(source, levels=c('BioMart2014','BioMart2020_byGene','BioMart2020_byID',
                                                        'NCBI','missing')))
                

# Single stacked horizontal bar: ggplot default hues for the real sources,
# gray for the 'missing' segment
p = labels_source %>% ggplot(aes(x, percentage, fill=source)) + geom_bar(position='stack', stat='identity') +
    theme_minimal() + coord_flip() + theme(legend.position='bottom', axis.title.y=element_blank(),
    axis.text.y=element_blank(), axis.ticks.y=element_blank()) + ylab('Percentage of genes') +
    scale_fill_manual(values = c(gg_colour_hue(nrow(labels_source)-1),'gray'))

# Legend rendered separately with ggpubr; the plotly version hides its own
ggplotly(p + theme(legend.position='none'))
as_ggplot(get_legend(p))

########################################################################################
# Reorder rows to match datExpr
datGenes = datGenes[match(rownames(datExpr), datGenes$ensembl_gene_id),]


rm(getinfo, mart, datGenes_BM, datGenes_biotype, datGenes_biotype_by_gene, datGenes_biotype_archive,
   dups, missing_ensembl_ids, missing_genes, labels_source, p)

Filtering



Checking how many SFARI genes are in the dataset

# SFARI genes present in the dataset (matched by Ensembl ID); n_SFARI counts
# unique gene symbols and is interpolated into the narrative text below
df = SFARI_genes %>% dplyr::select(-gene_biotype) %>% inner_join(datGenes, by=c('ID'='ensembl_gene_id'))
n_SFARI = df[['gene-symbol']] %>% unique %>% length

Considering all genes, this dataset contains 910 of the 912 SFARI genes

1.- Filter entries for which we didn’t manage to obtain their annotation information

1.1 Missing Biotype

# Drop genes that none of the four annotation sources could assign a biotype
to_keep = !is.na(datGenes$gene_biotype)

datGenes = datGenes[to_keep,]
datExpr = datExpr[to_keep,]
rownames(datGenes) = datGenes$ensembl_gene_id

Removed 1219 ‘genes’, 60850 remaining

Filtering genes without biotype information, we are left with 910 SFARI Genes (we lose 0 genes)




1.2 Missing Length of the sequence

# Drop genes without coordinates from the feb2014 query (length was computed
# as end_position - start_position, so it is NA exactly for those genes)
to_keep = !is.na(datGenes$length)

datExpr = datExpr[to_keep,]
datGenes = datGenes[to_keep,]

Removed 361 ‘genes’, 60489 remaining

Filtering genes without sequence length information, we are left with 910 SFARI Genes (we lose 0 genes)




2.- Filter genes that do not encode any protein


37% of the genes are protein coding genes

datGenes$gene_biotype %>% table %>% sort(decreasing=TRUE) %>% kable(caption='Biotypes of genes in dataset') %>%
                          kable_styling(full_width = F)
Biotypes of genes in dataset
. Freq
protein_coding 22113
lncRNA 11392
processed_pseudogene 9380
unprocessed_pseudogene 2461
miRNA 2212
misc_RNA 2156
snRNA 2017
1 1929
pseudogene 1353
snoRNA 1178
lincRNA 661
transcribed_unprocessed_pseudogene 654
rRNA_pseudogene 485
transcribed_processed_pseudogene 418
antisense 315
6 304
3 301
IG_V_pseudogene 168
TR_V_gene 146
IG_V_gene 134
transcribed_unitary_pseudogene 90
TR_J_gene 81
unitary_pseudogene 74
processed_transcript 59
rRNA 58
TR_V_pseudogene 46
sense_intronic 43
sense_overlapping 38
scaRNA 31
polymorphic_pseudogene 28
IG_D_gene 27
Mt_tRNA 22
IG_J_gene 18
4 17
7 16
IG_C_gene 14
TEC 9
IG_C_pseudogene 8
TR_C_gene 8
ribozyme 5
TR_J_pseudogene 5
3prime_overlapping_ncrna 4
IG_J_pseudogene 3
TR_D_gene 3
Mt_rRNA 2
8 1
translated_processed_pseudogene 1
translated_unprocessed_pseudogene 1

Most of the non-protein coding genes have very low levels of expression

# Mean raw expression per gene, flagged by protein-coding status.
# rowMeans() replaces apply(datExpr, 1, mean): identical values, but avoids
# the row-by-row loop and is the idiomatic vectorised form
plot_data = data.frame('ID' = rownames(datExpr), 'MeanExpr' = rowMeans(datExpr),
                       'ProteinCoding' = datGenes$gene_biotype=='protein_coding')

ggplotly(plot_data %>% ggplot(aes(log2(MeanExpr+1), fill=ProteinCoding, color=ProteinCoding)) + 
         geom_density(alpha=0.5) + theme_minimal())
rm(plot_data)
df = SFARI_genes %>% dplyr::select(-gene_biotype) %>% inner_join(datGenes, by=c('ID'='ensembl_gene_id'))

Filtering protein coding genes, we are left with 906 SFARI Genes (we lose 4 genes)

Note: The gene name for Ensembl ID ENSG00000187951 is wrong, it should be AC091057.1 instead of ARHGAP11B, but the biotype is right, so it would still be filtered out

# SFARI genes that survive the protein-coding filter (by unique symbol)
n_SFARI = df[['gene-symbol']][df$gene_biotype=='protein_coding'] %>% unique %>% length

# Table of the SFARI genes that are about to be lost with this filter
df %>% filter(!`gene-symbol` %in% df$`gene-symbol`[df$gene_biotype=='protein_coding']) %>% 
       dplyr::select(ID, `gene-symbol`, `gene-score`, gene_biotype, syndromic, `number-of-reports`) %>% 
       kable(caption='Lost Genes')  %>% kable_styling(full_width = F)
Lost Genes
ID gene-symbol gene-score gene_biotype syndromic number-of-reports
ENSG00000187951 ARHGAP11B 3 lncRNA 0 2
ENSG00000251593 MSNP1AS 2 processed_pseudogene 0 12
ENSG00000233067 PTCHD1-AS 2 lncRNA 0 3
ENSG00000197558 SSPO 3 transcribed_unitary_pseudogene 0 4
rm(df)
# Sanity check: datExpr and datGenes must be row-aligned before filtering
if(!all(rownames(datExpr)==rownames(datGenes))) cat('!!! gene rownames do not match!!!')

to_keep = datGenes$gene_biotype == 'protein_coding'
# dplyr::filter drops data.frame rownames, so they are restored afterwards
datExpr = datExpr %>% filter(to_keep)
datGenes = datGenes %>% filter(to_keep)
rownames(datExpr) = datGenes$ensembl_gene_id
rownames(datGenes) = datGenes$ensembl_gene_id


Removed 38376 genes. 22113 remaining


3.- Filter genes with low expression levels

\(\qquad\) 3.1 Remove genes with zero expression in all of the samples

# Keep genes with at least one non-zero count across samples
to_keep = rowSums(datExpr) > 0

# Before filtering, record the SFARI genes that will be lost here (all-zero
# rows), deduplicated by symbol and excluding symbols that survive via
# another Ensembl ID
df = data.frame('rowSums' = rowSums(datExpr), 'ensembl_gene_id' = rownames(datExpr)) %>%
     right_join(SFARI_genes, by='ensembl_gene_id') %>% filter(rowSums == 0 & !is.na(`gene-score`)) %>%
     arrange(`gene-score`) %>% dplyr::select(-ensembl_gene_id) %>%
     filter(!duplicated(`gene-symbol`), !`gene-symbol` %in% datGenes$hgnc_symbol[to_keep])

datExpr = datExpr[to_keep,]
datGenes = datGenes[to_keep,]


Removed 2803 genes. 19310 remaining

904 SFARI genes remaining (we lost 2 genes)

# Display the SFARI genes lost in the all-zero filter above
df %>% dplyr::select(ID, `gene-symbol`, `gene-score`, gene_biotype, syndromic, `number-of-reports`) %>% 
       kable(caption='Lost SFARI Genes') %>% kable_styling(full_width = F)
Lost SFARI Genes
ID gene-symbol gene-score gene_biotype syndromic number-of-reports
ENSG00000267910 KMT2A 1 protein_coding 0 20
ENSG00000235718 MFRP 2 protein_coding 0 6
ENSG00000167014 TERB2 3 protein_coding 0 1
n_SFARI = SFARI_genes[['gene-symbol']][SFARI_genes$ID %in% rownames(datExpr)] %>% unique %>% length


rm(df)


\(\qquad\) 3.2 Removing genes with a high percentage of zeros


Choosing the threshold:

Criteria for selecting the percentage of zeros threshold: The minimum value in which the preprocessed data is relatively homoscedastic (we’re trying to get rid of the group of genes with very low mean and SD that make the cloud of points look like a comic book speech bubble)

# Grid search over non-zero-percentage thresholds: for each candidate
# threshold, filter genes, drop outlier samples, vst-normalise, and record the
# per-gene mean/SD so homoscedasticity can be compared across thresholds.
# The working copies are snapshotted first and restored at the end.
datMeta_original = datMeta
datExpr_original = datExpr
datGenes_original = datGenes
# Candidate thresholds expressed as a % of samples (roughly 0-100 in steps)
thresholds = round(100*(ncol(datExpr)-c(seq(100,7,-10),5,3,2,0))/ncol(datExpr))

for(threshold in thresholds){
  
  # Restart from the unfiltered data on every iteration
  datMeta = datMeta_original
  datExpr = datExpr_original
  datGenes = datGenes_original
  
  # Keep genes whose percentage of non-zero samples meets the threshold
  to_keep = apply(datExpr, 1, function(x) 100*mean(x>0)) >= threshold
  datGenes = datGenes[to_keep,]
  datExpr = datExpr[to_keep,]
  
  # Filter outlier samples
  # (same connectivity-based criterion as in section 4 below)
  absadj = datExpr %>% bicor %>% abs
  netsummary = fundamentalNetworkConcepts(absadj)
  ku = netsummary$Connectivity
  z.ku = (ku-mean(ku))/sqrt(var(ku))
  
  to_keep = z.ku > -2
  datMeta = datMeta[to_keep,]
  datExpr = datExpr[,to_keep]
  
  rm(absadj, netsummary, ku, z.ku, to_keep)
  
  
  # Create a DeseqDataSet object, estimate the library size correction and save the normalized counts matrix
  counts = datExpr %>% as.matrix
  rowRanges = GRanges(datGenes$chromosome_name,
                      IRanges(start=datGenes$start_position, width=datGenes$length),
                      strand=datGenes$strand,
                      feature_id=datGenes$ensembl_gene_id)
  se = SummarizedExperiment(assays=SimpleList(counts=counts), rowRanges=rowRanges, colData=datMeta)
  dds = DESeqDataSet(se, design =~Diagnosis)
  
  # Perform vst
  vsd = vst(dds)
  
  datExpr_vst = assay(vsd)
  datMeta_vst = colData(vsd)
  datGenes_vst = rowRanges(vsd)
  
  rm(counts, rowRanges, se, vsd)
  
  # Save summary results in dataframe
  # NOTE(review): rbind in a loop grows the frame quadratically; fine at this
  # scale (~14 thresholds) but a list + bind_rows would scale better
  if(threshold == thresholds[1]){
    mean_vs_sd_data = data.frame('threshold'=threshold, 'ID'=rownames(datExpr_vst),
                                 'Mean'=rowMeans(datExpr_vst), 'SD'=apply(datExpr_vst,1,sd))
  } else {
    new_entries = data.frame('threshold'=threshold, 'ID'=rownames(datExpr_vst),
                                 'Mean'=rowMeans(datExpr_vst), 'SD'=apply(datExpr_vst,1,sd))
    mean_vs_sd_data = rbind(mean_vs_sd_data, new_entries)
  }
}

# Visualise the effects of different thresholds
# Keep all low-mean genes (the interesting 'speech bubble' region) but only a
# 10% random sample of the high-mean genes, to keep the plotly figure light.
# NOTE(review): sample() here is not seeded, so this figure is not reproducible
to_keep_1 = mean_vs_sd_data$ID[mean_vs_sd_data$threshold==thresholds[1] & mean_vs_sd_data$Mean<7] %>%
            as.character
to_keep_2 = mean_vs_sd_data$ID[mean_vs_sd_data$threshold==thresholds[1] & mean_vs_sd_data$Mean>=7]
to_keep_2 = to_keep_2 %>% sample(round(length(to_keep_2)/10)) %>% as.character

plot_data = mean_vs_sd_data[mean_vs_sd_data$ID %in% c(to_keep_1, to_keep_2),]

# Animated mean-vs-SD scatter, one frame per threshold
ggplotly(plot_data %>% ggplot(aes(Mean, SD)) + 
         geom_point(color='#0099cc', alpha=0.2, aes(id=ID, frame=threshold)) + 
         scale_x_log10() + scale_y_log10() + theme_minimal())
# Plot remaining genes
plot_data = mean_vs_sd_data %>% group_by(threshold) %>% tally

ggplotly(plot_data %>% ggplot(aes(threshold, n)) + geom_point() + geom_line() +
         theme_minimal() + ggtitle('Remaining genes for each filtering threshold'))
rm(to_keep_1, to_keep_2, plot_data, dds, thresholds)
# Return to original variables
datExpr = datExpr_original
datGenes = datGenes_original
datMeta = datMeta_original

rm(datExpr_original, datGenes_original, datMeta_original, datExpr_vst, datGenes_vst, datMeta_vst)


Selecting a threshold of 85

# Minimum percentage of non-zero entries allowed per gene
threshold = 85

plot_data = data.frame('id'=rownames(datExpr),
                       'non_zero_percentage' = apply(datExpr, 1, function(x) 100*mean(x>0)))

ggplotly(plot_data %>% ggplot(aes(x=non_zero_percentage)) + 
         geom_density(color='#0099cc', fill='#0099cc', alpha=0.3) + 
         geom_vline(xintercept=threshold, color='gray') +
         xlab('% of non-zero entries') + ylab('Density') +
         ggtitle('Percentage of non-zero entries distribution') + theme_minimal())
to_keep = apply(datExpr, 1, function(x) 100*mean(x>0)) >= threshold
datGenes = datGenes[to_keep,]
datExpr = datExpr[to_keep,]

Removed 6146 genes. 13164 remaining

767 SFARI genes remaining (we lost 137 genes)

n_SFARI = SFARI_genes[['gene-symbol']][SFARI_genes$ID %in% rownames(datExpr)] %>% unique %>% length

rm(threshold, plot_data, to_keep)


4.- Filter outlier samples

Using node connectivity as a distance measure, normalising it and filtering out samples farther away than 2 standard deviations from the left (lower connectivity than average, not higher)

# Sample-sample network: absolute biweight midcorrelation between samples
# (bicor correlates columns, which here are samples)
absadj = datExpr %>% bicor %>% abs
netsummary = fundamentalNetworkConcepts(absadj)
ku = netsummary$Connectivity
# Standardised connectivity (z-score; sqrt(var) is just sd)
z.ku = (ku-mean(ku))/sqrt(var(ku))

# Per-sample table with covariates, for the interactive outlier inspection
plot_data = data.frame('sample'=1:length(z.ku), 'distance'=z.ku, 'Sample_ID'=datMeta$ID, 
                       'Subject_ID'=datMeta$Subject_ID, 'Extraction_Batch'=datMeta$SiteHM,
                       'Brain_Lobe'=datMeta$Brain_lobe, 'Sex'=datMeta$Sex, 'Age'=datMeta$Age,
                       'Diagnosis'=datMeta$Diagnosis, 'PMI'=datMeta$PMI)

selectable_scatter_plot(plot_data, plot_data[,-c(1:3)])

Outlier samples: ba19.s13, ba44.s23

# Keep samples within 2 SD below the mean connectivity; only low-connectivity
# outliers are removed (high connectivity is not penalised)
to_keep = z.ku > -2
datMeta = datMeta[to_keep,]
datExpr = datExpr[,to_keep]

rm(absadj, netsummary, ku, z.ku, plot_data)

Removed 2 samples, 106 remaining

rm(to_keep)





5.- Filter repeated genes

There are 2 genes with more than one ensembl ID in the dataset. To accurately refer to the rows of my data as ‘genes’, I’m going to remove the repeated ones.

# Flag repeated gene symbols; duplicated() keeps the first Ensembl ID seen
# for each symbol and marks the rest for removal
dup_genes = datGenes$hgnc_symbol %>% duplicated

datGenes = datGenes[!dup_genes,]
datExpr = datExpr[!dup_genes,]

Removed 2 genes. 13162 remaining

767 SFARI genes remaining (we lost 0 genes)

rm(dup_genes, n_SFARI)


6.- Filter Samples to correct Sample’s Metadata imbalances related to Diagnosis

We have two variables with imbalances related to Diagnosis: Brain Lobe and Processing Group, with the Occipital lobe and the Maryland processing group being biased towards the Control group.

Because the cost of balancing the samples by Processing Group is too high, I’m not going to try to do it directly, but I’m going to choose the samples I’m going to remove to balance the samples by Brain Lobe considering the Processing site to at least reduce this imbalance a little

# Seeded so the removed-sample lists quoted in the text stay reproducible
set.seed(123)

# Remove Occipital controls from the Maryland site and Frontal ASD samples
# from the Harvard site: this balances Diagnosis vs Brain Lobe while also
# softening (not fixing) the Diagnosis vs Processing Group imbalance
ctl_samples = datMeta %>% filter(Diagnosis == 'CTL'& Brain_lobe == 'Occipital' & SiteHM == 'M') %>%
              sample_n(9) %>% pull(ID) %>% as.character

asd_samples = datMeta %>% filter(Diagnosis == 'ASD'& Brain_lobe == 'Frontal' & SiteHM == 'H') %>%
              sample_n(8) %>% pull(ID) %>% as.character

to_keep = !datMeta$ID %in% c(ctl_samples, asd_samples)

datMeta = datMeta[to_keep,]
datExpr = datExpr[,to_keep]

Removing the ASD samples with ID: ba44.s28, ba10.s86, ba44.s14, ba44.s27, ba10.s16, ba44.s13, ba44.s17, ba10.s8

Removing the Control samples with ID: ba19.s70, ba19.s74, ba19.s69, ba19.s55, ba19.s64, ba19.s73, ba19.s78, ba19.s66, ba19.s58

The samples are now balanced by Brain Lobe and they are still unbalanced by Processing Group, but at least not as much as before (Harvard 2:1 and Maryland 1:3)

rm(ctl_samples, asd_samples, to_keep)

table_info = datMeta %>% apply_labels(Diagnosis = 'Diagnosis', Brain_lobe = 'Brain Lobe', 
                                      SiteHM = 'Processing Group')

cro(table_info$Diagnosis, list(table_info$Brain_lobe,total()))
 Brain Lobe     #Total 
 Frontal   Occipital   
 Diagnosis 
   CTL  19 30   49
   ASD  16 24   40
   #Total cases  35 54   89
cro(table_info$Diagnosis, list(table_info$SiteHM,total()))
 Processing Group     #Total 
 H   M   
 Diagnosis 
   CTL  13 36   49
   ASD  28 12   40
   #Total cases  41 48   89


After filtering, the dataset consists of 13162 genes and 89 samples

Save filtered and annotated dataset

# Persist the filtered raw data so later stages can restart from here
save(datExpr, datMeta, datGenes, file='./../Data/filtered_raw_data.RData')
#load('./../Data/filtered_raw_data.RData')

Batch Effects Exploratory Analysis



Note: No batch correction is performed in this section, this is done after the normalisation step

According to Tackling the widespread and critical impact of batch effects in high-throughput data, technical artifacts can be an important source of variability in the data, so batch correction should be part of the standard preprocessing pipeline of gene expression data.

They say Processing group and Date of the experiment are good batch surrogates, I only have processing group, so I’m going to see if this affects the data in any clear way to use it as a surrogate.

Processing group


Data was processed by two different groups: the Harvard Brain Tissue Resource Center (H) and the NICHD Brain and Tissue Bank at the University of Maryland (M)

# Re-label after filtering to tabulate the remaining samples per batch
table_info = datMeta %>% apply_labels(SiteHM = 'Processing Group', Diagnosis = 'Diagnosis')

# Sample counts per processing site (H = Harvard, M = Maryland)
table_info$SiteHM %>% cro
 #Total 
 Processing Group 
   H  41
   M  48
   #Total cases  89


Samples don’t seem to cluster together that strongly by batch, and in the cases where they do align, since we know there is a relation between Processing Group and Diagnosis, we don’t know which of these variables is responsible for it

# Hierarchical clustering of samples on euclidean distances between
# expression profiles (samples as rows after transposing)
h_clusts = datExpr %>% t %>% dist %>% hclust %>% as.dendrogram

# Build a lookup from age to a viridis colour
# NOTE(review): assumes Age is integer-valued — seq(min_age, max_age) steps by 1
# and must have the same length as the palette; confirm against datMeta$Age
create_viridis_dict = function(){
  min_age = datMeta$Age %>% min
  max_age = datMeta$Age %>% max
  viridis_age_cols = viridis(max_age - min_age + 1)
  names(viridis_age_cols) = seq(min_age, max_age)
  
  return(viridis_age_cols)
}
viridis_age_cols = create_viridis_dict()

# Per-sample colour annotations, reordered to match the dendrogram leaf order
dend_meta = datMeta[match(labels(h_clusts), datMeta$ID),] %>% 
            mutate('Site' = ifelse(SiteHM=='H', '#F8766D', '#00BFC4'),
                   'Diagnosis' = ifelse(Diagnosis=='CTL','#008080','#86b300'), # Teal control, Green ASD
                   'Sex' = ifelse(Sex=='F','#ff6666','#008ae6'),            # Pink Female, Blue Male
                   'Region' = case_when(Brain_lobe=='Frontal'~'#F8766D',        # ggplot defaults for 2 colours
                                        Brain_lobe=='Occipital'~'#00BFC4'),
                   'Age' = viridis_age_cols[as.character(Age)]) %>%             # Purple: young, Yellow: old
            dplyr::select(Age, Region, Sex, Diagnosis, Site)
# Plot the dendrogram with blank labels and 9 coloured branches, then add the
# per-sample annotation bars underneath (colored_bars must follow plot())
h_clusts %>% dendextend::set('labels', rep('', nrow(datMeta))) %>% 
             dendextend::set('branches_k_color', k=9) %>% plot
colored_bars(colors=dend_meta)

rm(h_clusts, dend_meta, create_viridis_dict, viridis_age_cols)
  • Comparing the mean expression of each sample by batch we can see there is some batch effect differentiating them

  • We were expecting some kind of imbalance here since we know that ASD samples have a higher general level of expression when compared to the Control group

  • What we weren’t expecting to find is the Maryland group to have a higher level of expression than the Harvard group (since this group has more Control samples than ASD and the Harvard group has the opposite bias)

# Per-sample mean expression, one data.frame per processing site, stacked
batch_levels = c('H', 'M')
plot_data = do.call(rbind, lapply(batch_levels, function(b) {
  data.frame('Mean' = colMeans(datExpr[, datMeta$SiteHM == b]), 'Batch' = b)
}))

# Mean of the per-sample means within each batch (dashed vertical lines)
mu = plot_data %>% group_by(Batch) %>% dplyr::summarise(BatchMean = mean(Mean))

ggplotly(plot_data %>% ggplot(aes(x=Mean, color=Batch, fill=Batch)) + geom_density(alpha=0.3) + 
         geom_vline(data=mu, aes(xintercept=BatchMean, color=Batch), linetype='dashed') +
         ggtitle('Mean expression by sample grouped by Batch') + scale_x_log10() + theme_minimal())
rm(batch_levels, plot_data, mu)



Looking for unknown sources of batch effects


Following the pipeline from Surrogate variable analysis: hidden batch effects where sva is used with DESeq2:

Create a DeseqDataSet object, estimate the library size correction and save the normalized counts matrix

# Raw counts as a matrix (genes x samples)
counts = datExpr %>% as.matrix
# Gene coordinates for the SummarizedExperiment container
# NOTE(review): relies on datGenes having chromosome_name / start_position /
# length / strand columns, i.e. biomaRt annotation added upstream — confirm
rowRanges = GRanges(datGenes$chromosome_name,
                  IRanges(datGenes$start_position, width=datGenes$length),
                  strand=datGenes$strand,
                  feature_id=datGenes$ensembl_gene_id)
se = SummarizedExperiment(assays=SimpleList(counts=counts), rowRanges=rowRanges, colData=datMeta)
# Model Diagnosis only; size factors then give the normalised counts for sva
dds = DESeqDataSet(se, design = ~ Diagnosis)
## converting counts to integer mode
dds = estimateSizeFactors(dds)
norm.cts = counts(dds, normalized = TRUE)

Provide the normalized counts and two model matrices to SVA. The first matrix uses the biological condition, and the second model matrix is the null model.

# Full model (includes the biological condition) and intercept-only null model
pheno = colData(dds)
mod = model.matrix(~ Diagnosis, pheno)
mod0 = model.matrix(~ 1, pheno)
# Estimate surrogate variables from the normalised counts
sva_fit = svaseq(norm.cts, mod = mod, mod0 = mod0)
## Number of significant surrogate variables is:  20 
## Iteration (out of 5 ):1  2  3  4  5
rm(mod, mod0, norm.cts, pheno)

Found 20 surrogate variables, since there is no direct way to select which ones to pick Bioconductor answer, kept all of them.

Include SV estimations to datMeta information

# Surrogate variable estimates as a data.frame with columns SV1..SVk
sv_data = data.frame(sva_fit$sv)
colnames(sv_data) = paste0('SV', seq_len(ncol(sv_data)))

# Append the SV estimates to the sample metadata
datMeta_sva = cbind(datMeta, sv_data)

rm(sv_data, sva_fit)

In conclusion: Site could work as a surrogate for batch effects, but has the huge downside that it is correlated to Diagnosis! The sva package found 20 other variables that could work as surrogates which are now included in datMeta and should be included in the DEA.




Normalisation and Differential Expression Analysis



Using DESeq2 package to perform normalisation. I chose this package over limma because limma uses the log transformed data as input instead of the raw counts and I have discovered that in this dataset, this transformation affects genes differently depending on their mean expression level, and genes with a high SFARI score are specially affected by this.

# Mean vs standard deviation per gene on the raw data (log-log axes);
# the grey identity line helps judge the mean-variance relationship
plot_data = data.frame('ID' = rownames(datExpr),
                       'Mean' = rowMeans(datExpr),
                       'SD' = apply(datExpr, 1, sd))

ggplot(plot_data, aes(Mean, SD)) +
  geom_point(color='#0099cc', alpha=0.1) +
  geom_abline(color='gray') +
  scale_x_log10() + scale_y_log10() + theme_minimal()

rm(plot_data)
# Rebuild the DESeq2 object, this time with the surrogate variables in colData
counts = datExpr %>% as.matrix
# NOTE(review): relies on biomaRt gene coordinates being present in datGenes — confirm
rowRanges = GRanges(datGenes$chromosome_name,
                  IRanges(datGenes$start_position, width=datGenes$length),
                  strand=datGenes$strand,
                  feature_id=datGenes$ensembl_gene_id)
se = SummarizedExperiment(assays=SimpleList(counts=counts), rowRanges=rowRanges, colData=datMeta_sva)
# Design: batch surrogate (SiteHM) + the 20 SVs, with Diagnosis last so
# results() reports the Diagnosis contrast by default
dds = DESeqDataSet(se, design = ~ SiteHM + SV1 + SV2 + SV3 + SV4 + SV5 + SV6 + SV7 + SV8 + SV9 + 
                                  SV10 + SV11 + SV12 + SV13 + SV14 + SV15 + SV16 + SV17 + SV18 +
                                  SV19 + SV20 + Diagnosis)
## converting counts to integer mode
# Perform DEA step by step instead of the DESeq() wrapper (see note below)
#dds = DESeq(dds) # Changed this for the three lines below because some rows don't converge
dds = estimateSizeFactors(dds)
dds = estimateDispersions(dds)
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
# Wald test with a raised iteration cap; 27 genes still fail to converge
dds = nbinomWaldTest(dds, maxit=10000)
## 27 rows did not converge in beta, labelled in mcols(object)$betaConv. Use larger maxit argument with nbinomWaldTest
DE_info = results(dds, lfcThreshold=0, altHypothesis='greaterAbs')

# Perform vst (variance stabilising transformation) for downstream analyses
vsd = vst(dds)

# Extract the transformed expression, sample metadata and gene ranges
datExpr_vst = assay(vsd)
datMeta_vst = colData(vsd)
datGenes_vst = rowRanges(vsd)

rm(counts, rowRanges, se, vsd)

Using the plotting function DESeq2’s manual proposes to study vst’s output, it looks like the data could be homoscedastic

meanSdPlot(datExpr_vst, plot=FALSE)$gg + theme_minimal()

Plotting points individually we can notice some heteroscedasticity in the data

# Mean vs SD per gene after vst; geom_smooth reveals any remaining trend
plot_data = data.frame('ID'=rownames(datExpr_vst), 'Mean'=rowMeans(datExpr_vst), 'SD'=apply(datExpr_vst,1,sd))

plot_data %>% ggplot(aes(Mean, SD)) + geom_point(color='#0099cc', alpha=0.2) + geom_smooth(color = 'gray') +
              scale_x_log10() + scale_y_log10() + theme_minimal()

rm(plot_data)



Rename normalised datasets to continue working with these

# Replace the working datasets with the vst-normalised versions
datExpr = datExpr_vst
datMeta = datMeta_vst %>% data.frame   # DataFrame -> base data.frame
datGenes = datGenes_vst

rm(datExpr_vst, datMeta_vst, datGenes_vst, datMeta_sva)




Batch Effect Correction



By including the surrogate variables in the DESeq formula we only modelled the batch effects into the DEA, but we didn’t actually correct them from the data, for that we need to use ComBat (or other equivalent package) in the already normalised data

SVA surrogate variables


In some places they say you shouldn’t correct these effects on the data because you risk losing biological variation, in others they say you should because they introduce noise to the data. The only thing everyone agrees on is that you shouldn’t remove them before performing DEA but instead include them in the model.

Based on the conclusions from Practical impacts of genomic data “cleaning” on biological discovery using surrogate variable analysis it seems like it may be a good idea to remove the batch effects from the data and not only from the DE analysis:

  • Using SVA, ComBat or related tools can increase the power to identify specific signals in complex genomic datasets (they found “greatly sharpened global and gene-specific differential expression across treatment groups”)

  • But caution should be exercised to avoid removing biological signal of interest

  • We must be precise and deliberate in the design and analysis of experiments and the resulting data, and also mindful of the limitations we impose with our own perspective

  • Open data exploration is not possible after such supervised “cleaning”, because effects beyond those stipulated by the researcher may have been removed

Comparing data with and without surrogate variable correction

# Taken from https://www.biostars.org/p/121489/#121500
# Regress out the surrogate-variable component of an expression matrix
# (genes x samples), keeping the fitted biological (mod) component intact.
correctDatExpr = function(datExpr, mod, svs) {
  design = cbind(mod, svs)
  # OLS projection matrix: gives coefficients for every gene at once
  proj = solve(t(design) %*% design) %*% t(design)
  coefs = proj %*% t(datExpr)
  rm(proj)
  gc()
  # First n_bio columns of the design are the biological model; everything
  # after them is a surrogate variable whose effect we subtract
  n_bio = ncol(mod)
  sv_effect = t(as.matrix(design[,-c(1:n_bio)]) %*% coefs[-c(1:n_bio),])
  datExpr - sv_effect
}

# PCA of samples (columns) and of genes (rows) before the correction
pca_samples_before = datExpr %>% t %>% prcomp
pca_genes_before = datExpr %>% prcomp

# Correct
mod = model.matrix(~ Diagnosis, colData(dds))
# All surrogate-variable columns (SV1..SV20) from the metadata
svs = datMeta %>% dplyr::select(contains('SV')) %>% as.matrix
datExpr_corrected = correctDatExpr(as.matrix(datExpr), mod, svs)

# PCA after removing the surrogate-variable component
pca_samples_after = datExpr_corrected %>% t %>% prcomp
pca_genes_after = datExpr_corrected %>% prcomp


rm(correctDatExpr)

Samples


Removing batch effects has a big impact in the distribution of the samples, separating them by diagnosis relatively well just using the first principal component (although the separation is not nearly as good as with the Gandal dataset)

# Long-format PC coordinates before (corrected=0) and after (corrected=1)
# the SV correction, joined to the sample metadata for colouring
pca_samples_df = rbind(data.frame('ID'=colnames(datExpr), 'PC1'=pca_samples_before$x[,1],
                                  'PC2'=pca_samples_before$x[,2], 'corrected'=0),
                       data.frame('ID'=colnames(datExpr), 'PC1'=pca_samples_after$x[,1],
                                  'PC2'=pca_samples_after$x[,2], 'corrected'=1)) %>%
                 left_join(datMeta %>% mutate('ID'=rownames(datMeta)), by='ID')

# Animated plotly scatter (frame = corrected); axis labels report each PC's
# before/after correlation and the change in % variance explained
ggplotly(pca_samples_df %>% ggplot(aes(PC1, PC2, color=Diagnosis)) + geom_point(aes(frame=corrected, id=ID), alpha=0.75) + 
         xlab(paste0('PC1 (corr=', round(cor(pca_samples_before$x[,1],pca_samples_after$x[,1]),2),
                     '). % Var explained: ', round(100*summary(pca_samples_before)$importance[2,1],1),' to ',
                     round(100*summary(pca_samples_after)$importance[2,1],1))) +
         ylab(paste0('PC2 (corr=', round(cor(pca_samples_before$x[,2],pca_samples_after$x[,2]),2),
                     '). % Var explained: ', round(100*summary(pca_samples_before)$importance[2,2],1),' to ',
                     round(100*summary(pca_samples_after)$importance[2,2],1))) +
         ggtitle('Samples') + theme_minimal())
rm(pca_samples_df)


Genes


It seems like the sva correction preserves the mean expression of the genes and erases almost everything else (although what little else remains is enough to characterise the two Diagnosis groups pretty well using only the first PC)

*Plot is done with only 10% of the genes so it’s not that heavy

# PC coordinates for genes before/after correction, coloured by mean expression
pca_genes_df = rbind(data.frame('ID'=rownames(datExpr), 'PC1'=pca_genes_before$x[,1],
                                'PC2'=pca_genes_before$x[,2], 'corrected'=0, 'MeanExpr'=rowMeans(datExpr)),
                     data.frame('ID'=rownames(datExpr), 'PC1'=pca_genes_after$x[,1],
                                'PC2'=pca_genes_after$x[,2], 'corrected'=1, 'MeanExpr'=rowMeans(datExpr)))

# Random 10% of the genes to keep the interactive plot light
# NOTE(review): 0.1*nrow() is non-integer; sample() coerces the size — consider round()
keep_genes = rownames(datExpr) %>% sample(0.1*nrow(datExpr))

pca_genes_df = pca_genes_df %>% filter(ID %in% keep_genes)

# Animated plotly scatter; axis labels as in the samples plot above
ggplotly(pca_genes_df %>% ggplot(aes(PC1, PC2,color=MeanExpr)) + 
         geom_point(alpha=0.3, aes(frame=corrected, id=ID)) +
         xlab(paste0('PC1 (corr=', round(cor(pca_genes_before$x[,1],pca_genes_after$x[,1]),2),
                     '). % Var explained: ', round(100*summary(pca_genes_before)$importance[2,1],1),' to ',
                     round(100*summary(pca_genes_after)$importance[2,1],1))) +
         ylab(paste0('PC2 (corr=', round(cor(pca_genes_before$x[,2],pca_genes_after$x[,2]),2),
                     '). % Var explained: ', round(100*summary(pca_genes_before)$importance[2,2],1),' to ',
                     round(100*summary(pca_genes_after)$importance[2,2],1))) +
         scale_color_viridis() + ggtitle('Genes') + theme_minimal())
rm(pca_samples_before, pca_genes_before, mod, svs, pca_samples_after, pca_genes_after, pca_genes_df, keep_genes)

Everything looks good, so we’re keeping the corrected expression dataset

# Adopt the SV-corrected expression matrix as the working dataset
datExpr = datExpr_corrected

rm(datExpr_corrected)


Processing Group


Even after correcting the dataset for the surrogate variables found with sva, there is still a difference in mean expression by processing group, but this time, the Harvard Group has a higher level of expression than the Maryland one, which agrees with their ASD-Control sample imbalance

# Per-sample mean expression split by processing site (H = Harvard, M = Maryland)
plot_data_b1 = data.frame('Mean'=colMeans(datExpr[,datMeta$SiteHM=='H']), 'Batch'='H')
plot_data_b2 = data.frame('Mean'=colMeans(datExpr[,datMeta$SiteHM=='M']), 'Batch'='M')

plot_data = rbind(plot_data_b1, plot_data_b2)
# Mean of the per-sample means within each batch (dashed vertical lines)
mu = plot_data %>% group_by(Batch) %>% dplyr::summarise(BatchMean=mean(Mean)) %>% ungroup

ggplotly(plot_data %>% ggplot(aes(x=Mean, color=Batch, fill=Batch)) + geom_density(alpha=0.3) + 
         geom_vline(data=mu, aes(xintercept=BatchMean, color=Batch), linetype='dashed') +
         # Title fixed: grouping variable is the Processing Group (SiteHM), not a processing date
         ggtitle('Mean expression by sample grouped by Processing Group') + scale_x_log10() + theme_minimal())
rm(plot_data_b1, plot_data_b2, plot_data, mu)


Performing Batch Correction for Processing Group



I will save the batch corrected dataset as a different dataset because of the correlation between processing group and diagnosis and I will use the dataset without the batch correction for the rest of the pipeline to avoid losing Diagnosis information when removing the Processing Group signal from the data (in the end, since I’m using this dataset only as a backup for Gandal’s dataset, it’s better to have false positives in the genes with relation to Diagnosis than false negatives, which would be what could potentially happen if we use the batch corrected dataset)

https://support.bioconductor.org/p/50983/

# Remove the Processing Group batch effect with ComBat (no covariates protected);
# kept as a separate dataset because SiteHM is correlated with Diagnosis
datExpr_combat = datExpr %>% as.matrix %>% ComBat(batch=datMeta$SiteHM)
## Found2batches
## Adjusting for0covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data

Now both batches have similar mean expression

# Per-sample mean expression after ComBat, split by processing site
plot_data_b1 = data.frame('Mean'=colMeans(datExpr_combat[,datMeta$SiteHM=='H']), 'Batch'='H')
plot_data_b2 = data.frame('Mean'=colMeans(datExpr_combat[,datMeta$SiteHM=='M']), 'Batch'='M')

plot_data = rbind(plot_data_b1, plot_data_b2)
# Mean of the per-sample means within each batch (dashed vertical lines)
mu = plot_data %>% group_by(Batch) %>% dplyr::summarise(BatchMean=mean(Mean)) %>% ungroup

ggplotly(plot_data %>% ggplot(aes(x=Mean, color=Batch, fill=Batch)) + geom_density(alpha=0.3) + 
         geom_vline(data=mu, aes(xintercept=BatchMean, color=Batch), linetype='dashed') +
         # Title fixed: grouping variable is the Processing Group (SiteHM), not a processing date
         ggtitle('Mean expression by sample grouped by Processing Group') + scale_x_log10() + theme_minimal())
rm(plot_data_b1, plot_data_b2, plot_data, mu)



Save preprocessed datasets with and without ComBat correction

# Dataset used for the rest of the pipeline (no Processing Group correction)
save(datExpr, datMeta, datGenes, DE_info, dds, file='./../Data/preprocessed_data.RData')
# ComBat-corrected alternative, kept as a backup
save(datExpr_combat, datMeta, datGenes, DE_info, dds, file='./../Data/preprocessed_data_ComBat.RData')




Session info

sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.4 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
## 
## locale:
##  [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
##  [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
##  [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] kableExtra_1.1.0            knitr_1.28                 
##  [3] expss_0.10.2                dendextend_1.13.4          
##  [5] vsn_3.52.0                  WGCNA_1.69                 
##  [7] fastcluster_1.1.25          dynamicTreeCut_1.63-1      
##  [9] sva_3.32.1                  genefilter_1.66.0          
## [11] mgcv_1.8-31                 nlme_3.1-147               
## [13] DESeq2_1.24.0               SummarizedExperiment_1.14.1
## [15] DelayedArray_0.10.0         BiocParallel_1.18.1        
## [17] matrixStats_0.56.0          Biobase_2.44.0             
## [19] GenomicRanges_1.36.1        GenomeInfoDb_1.20.0        
## [21] IRanges_2.18.3              S4Vectors_0.22.1           
## [23] BiocGenerics_0.30.0         biomaRt_2.40.5             
## [25] ggpubr_0.2.5                magrittr_1.5               
## [27] ggExtra_0.9                 GGally_1.5.0               
## [29] gridExtra_2.3               viridis_0.5.1              
## [31] viridisLite_0.3.0           RColorBrewer_1.1-2         
## [33] plotlyutils_0.0.0.9000      plotly_4.9.2               
## [35] glue_1.4.1                  reshape2_1.4.4             
## [37] forcats_0.5.0               stringr_1.4.0              
## [39] dplyr_1.0.0                 purrr_0.3.4                
## [41] readr_1.3.1                 tidyr_1.1.0                
## [43] tibble_3.0.1                ggplot2_3.3.2              
## [45] tidyverse_1.3.0            
## 
## loaded via a namespace (and not attached):
##   [1] readxl_1.3.1           backports_1.1.8        Hmisc_4.4-0           
##   [4] plyr_1.8.6             lazyeval_0.2.2         splines_3.6.3         
##   [7] crosstalk_1.1.0.1      digest_0.6.25          foreach_1.5.0         
##  [10] htmltools_0.4.0        GO.db_3.8.2            fansi_0.4.1           
##  [13] checkmate_2.0.0        memoise_1.1.0          doParallel_1.0.15     
##  [16] cluster_2.1.0          limma_3.40.6           annotate_1.62.0       
##  [19] modelr_0.1.6           prettyunits_1.1.1      jpeg_0.1-8.1          
##  [22] colorspace_1.4-1       blob_1.2.1             rvest_0.3.5           
##  [25] haven_2.2.0            xfun_0.12              hexbin_1.28.1         
##  [28] crayon_1.3.4           RCurl_1.98-1.2         jsonlite_1.7.0        
##  [31] impute_1.58.0          iterators_1.0.12       survival_3.1-12       
##  [34] gtable_0.3.0           zlibbioc_1.30.0        XVector_0.24.0        
##  [37] webshot_0.5.2          scales_1.1.1           DBI_1.1.0             
##  [40] miniUI_0.1.1.1         Rcpp_1.0.4.6           xtable_1.8-4          
##  [43] progress_1.2.2         htmlTable_1.13.3       foreign_0.8-76        
##  [46] bit_1.1-15.2           preprocessCore_1.46.0  Formula_1.2-3         
##  [49] htmlwidgets_1.5.1      httr_1.4.1             acepack_1.4.1         
##  [52] ellipsis_0.3.1         farver_2.0.3           pkgconfig_2.0.3       
##  [55] reshape_0.8.8          XML_3.99-0.3           nnet_7.3-14           
##  [58] dbplyr_1.4.2           locfit_1.5-9.4         labeling_0.3          
##  [61] tidyselect_1.1.0       rlang_0.4.6            later_1.0.0           
##  [64] AnnotationDbi_1.46.1   munsell_0.5.0          cellranger_1.1.0      
##  [67] tools_3.6.3            cli_2.0.2              generics_0.0.2        
##  [70] RSQLite_2.2.0          broom_0.5.5            evaluate_0.14         
##  [73] fastmap_1.0.1          yaml_2.2.1             bit64_0.9-7           
##  [76] fs_1.4.0               mime_0.9               xml2_1.2.5            
##  [79] compiler_3.6.3         rstudioapi_0.11        curl_4.3              
##  [82] png_0.1-7              affyio_1.54.0          ggsignif_0.6.0        
##  [85] reprex_0.3.0           geneplotter_1.62.0     stringi_1.4.6         
##  [88] highr_0.8              lattice_0.20-41        Matrix_1.2-18         
##  [91] vctrs_0.3.1            pillar_1.4.4           lifecycle_0.2.0       
##  [94] BiocManager_1.30.10    cowplot_1.0.0          data.table_1.12.8     
##  [97] bitops_1.0-6           httpuv_1.5.2           affy_1.62.0           
## [100] R6_2.4.1               latticeExtra_0.6-29    promises_1.1.0        
## [103] codetools_0.2-16       assertthat_0.2.1       withr_2.2.0           
## [106] GenomeInfoDbData_1.2.1 hms_0.5.3              grid_3.6.3            
## [109] rpart_4.1-15           rmarkdown_2.1          shiny_1.4.0.2         
## [112] lubridate_1.7.4        base64enc_0.1-3